{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-04T01:44:39.965854Z",
     "iopub.status.busy": "2021-02-04T01:44:39.965091Z",
     "iopub.status.idle": "2021-02-04T01:44:42.798749Z",
     "shell.execute_reply": "2021-02-04T01:44:42.799416Z"
    },
    "papermill": {
     "duration": 2.849639,
     "end_time": "2021-02-04T01:44:42.799780",
     "exception": false,
     "start_time": "2021-02-04T01:44:39.950141",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type='text/css'>\n",
       ".datatable table.frame { margin-bottom: 0; }\n",
       ".datatable table.frame thead { border-bottom: none; }\n",
       ".datatable table.frame tr.coltypes td {  color: #FFFFFF;  line-height: 6px;  padding: 0 0.5em;}\n",
       ".datatable .bool    { background: #DDDD99; }\n",
       ".datatable .object  { background: #565656; }\n",
       ".datatable .int     { background: #5D9E5D; }\n",
       ".datatable .float   { background: #4040CC; }\n",
       ".datatable .str     { background: #CC4040; }\n",
       ".datatable .row_index {  background: var(--jp-border-color3);  border-right: 1px solid var(--jp-border-color0);  color: var(--jp-ui-font-color3);  font-size: 9px;}\n",
       ".datatable .frame tr.coltypes .row_index {  background: var(--jp-border-color0);}\n",
       ".datatable th:nth-child(2) { padding-left: 12px; }\n",
       ".datatable .hellipsis {  color: var(--jp-cell-editor-border-color);}\n",
       ".datatable .vellipsis {  background: var(--jp-layout-color0);  color: var(--jp-cell-editor-border-color);}\n",
       ".datatable .na {  color: var(--jp-cell-editor-border-color);  font-size: 80%;}\n",
       ".datatable .footer { font-size: 9px; }\n",
       ".datatable .frame_dimensions {  background: var(--jp-border-color3);  border-top: 1px solid var(--jp-border-color0);  color: var(--jp-ui-font-color3);  display: inline-block;  opacity: 0.6;  padding: 1px 10px 1px 5px;}\n",
       "</style>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "import lightgbm as lgb \n",
    "from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder\n",
    "from sklearn.metrics import log_loss\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import gc\n",
    "from scipy import sparse\n",
    "import datatable as dt\n",
    "from catboost import CatBoostClassifier, Pool\n",
    "import optuna\n",
    "import sklearn.datasets\n",
    "import sklearn.metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-04T01:44:42.817747Z",
     "iopub.status.busy": "2021-02-04T01:44:42.816992Z",
     "iopub.status.idle": "2021-02-04T01:44:42.831771Z",
     "shell.execute_reply": "2021-02-04T01:44:42.832275Z"
    },
    "papermill": {
     "duration": 0.025109,
     "end_time": "2021-02-04T01:44:42.832507",
     "exception": false,
     "start_time": "2021-02-04T01:44:42.807398",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def reduce_mem_usage(df):\n",
    "    \"\"\" \n",
    "    iterate through all the columns of a dataframe and \n",
    "    modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(('Memory usage of dataframe is {:.2f}' \n",
    "                     'MB').format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == 'int':\n",
    "                if c_min > np.iinfo(np.int8).min and c_max <\\\n",
    "                  np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max <\\\n",
    "                   np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max <\\\n",
    "                   np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max <\\\n",
    "                   np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if c_min > np.finfo(np.float16).min and c_max <\\\n",
    "                   np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max <\\\n",
    "                   np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype('category')\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(('Memory usage after optimization is: {:.2f}' \n",
    "                              'MB').format(end_mem))\n",
    "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) \n",
    "                                             / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-04T01:44:42.850353Z",
     "iopub.status.busy": "2021-02-04T01:44:42.849569Z",
     "iopub.status.idle": "2021-02-04T01:45:55.326107Z",
     "shell.execute_reply": "2021-02-04T01:45:55.325407Z"
    },
    "papermill": {
     "duration": 72.487052,
     "end_time": "2021-02-04T01:45:55.326267",
     "exception": false,
     "start_time": "2021-02-04T01:44:42.839215",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "读取数据...\n",
      "读取结束\n",
      "Data preprocessing...\n",
      "['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64', 'feature_65', 'feature_66', 'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feature_71', 'feature_72', 'feature_73', 'feature_74', 'feature_75', 'feature_76', 'feature_77', 'feature_78', 'feature_79', 'feature_80', 'feature_81', 'feature_82', 'feature_83', 'feature_84', 'feature_85', 'feature_86', 'feature_87', 'feature_88', 'feature_89', 'feature_90', 'feature_91', 'feature_92', 'feature_93', 'feature_94', 'feature_95', 'feature_96', 'feature_97', 'feature_98', 'feature_99', 'feature_100', 'feature_101', 'feature_102', 'feature_103', 'feature_104', 'feature_105', 'feature_106', 'feature_107', 'feature_108', 'feature_109', 'feature_110', 'feature_111', 'feature_112', 'feature_113', 'feature_114', 'feature_115', 'feature_116', 'feature_117', 'feature_118', 'feature_119', 'feature_120', 'feature_121', 'feature_122', 'feature_123', 'feature_124', 'feature_125', 'feature_126', 'feature_127', 'feature_128', 'feature_129']\n",
      "Memory usage of dataframe is 1660.47MB\n",
      "Memory usage after optimization is: 428.60MB\n",
      "Decreased by 74.2%\n",
      "Done!\n",
      "CPU times: user 23.1 s, sys: 33.3 s, total: 56.4 s\n",
      "Wall time: 1min 12s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print('读取数据...')\n",
    "train = pd.read_feather('../input/janestreettrainfeather/train.feather')\n",
    "print('读取结束')\n",
    "print('Data preprocessing...')\n",
    "\n",
    "train = train.query('date > 85').reset_index(drop = True)   # 只保留第86天及以后的data\n",
    "train = train[train['weight'] > 0]\n",
    "train['action'] =  (train['resp'] > 0 ).astype('int')\n",
    "features = []\n",
    "for item in train.columns:\n",
    "#    if 'feature' in item or 'weight' in item:\n",
    "    if 'feature' in item:\n",
    "        features.append(item)\n",
    "print(features)\n",
    "features_mean = train.loc[:, features].mean()\n",
    "train.fillna(train.mean(),inplace=True) \n",
    "reduce_mem_usage(train)\n",
    "print('Done!')\n",
    "VALID_DAYS = 50  # using for valid\n",
    "#resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']\n",
    "df_train = train[train['date'] <= 499-VALID_DAYS]\n",
    "df_test = train[train['date'] > 499-VALID_DAYS]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-04T01:45:55.354101Z",
     "iopub.status.busy": "2021-02-04T01:45:55.353432Z",
     "iopub.status.idle": "2021-02-04T01:45:55.357026Z",
     "shell.execute_reply": "2021-02-04T01:45:55.356246Z"
    },
    "papermill": {
     "duration": 0.022015,
     "end_time": "2021-02-04T01:45:55.357194",
     "exception": false,
     "start_time": "2021-02-04T01:45:55.335179",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def objective(trial):\n",
    "    x_train = df_train[features]\n",
    "    x_val = df_test[features]\n",
    "    y_train = df_train['action']\n",
    "    y_val = df_test['action']\n",
    "    dtrain = lgb.Dataset(x_train, label=y_train)\n",
    "\n",
    "    param = {\n",
    "        \"objective\": \"binary\",\n",
    "        \"metric\": \"binary_logloss\",\n",
    "        \"verbosity\": -1,\n",
    "        \"boosting_type\": \"gbdt\",\n",
    "        \"lambda_l1\": trial.suggest_float(\"lambda_l1\", 1e-8, 10.0, log=True),\n",
    "        \"lambda_l2\": trial.suggest_float(\"lambda_l2\", 1e-8, 10.0, log=True),\n",
    "        \"num_leaves\": trial.suggest_int(\"num_leaves\", 2, 256),\n",
    "        \"feature_fraction\": trial.suggest_float(\"feature_fraction\", 0.4, 1.0),\n",
    "        \"bagging_fraction\": trial.suggest_float(\"bagging_fraction\", 0.4, 1.0),\n",
    "        \"bagging_freq\": trial.suggest_int(\"bagging_freq\", 1, 7),\n",
    "        \"min_child_samples\": trial.suggest_int(\"min_child_samples\", 5, 100),\n",
    "    }\n",
    "\n",
    "    gbm = lgb.train(param, dtrain)\n",
    "    preds = gbm.predict(x_val)\n",
    "    pred_labels = np.rint(preds)\n",
    "    accuracy = sklearn.metrics.accuracy_score(y_val, pred_labels)\n",
    "    return accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-04T01:45:55.382816Z",
     "iopub.status.busy": "2021-02-04T01:45:55.381727Z",
     "iopub.status.idle": "2021-02-04T01:52:57.602804Z",
     "shell.execute_reply": "2021-02-04T01:52:57.602212Z"
    },
    "papermill": {
     "duration": 422.236884,
     "end_time": "2021-02-04T01:52:57.602956",
     "exception": false,
     "start_time": "2021-02-04T01:45:55.366072",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m[I 2021-02-04 01:45:55,379]\u001b[0m A new study created in memory with name: no-name-740db11a-3237-43e1-b304-7f320fbdddea\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:46:49,199]\u001b[0m Trial 0 finished with value: 0.5205459721422036 and parameters: {'lambda_l1': 2.0486262267193102e-07, 'lambda_l2': 1.0428604606646677e-05, 'num_leaves': 228, 'feature_fraction': 0.8439041586800065, 'bagging_fraction': 0.6906701287512585, 'bagging_freq': 6, 'min_child_samples': 83}. Best is trial 0 with value: 0.5205459721422036.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:47:27,821]\u001b[0m Trial 1 finished with value: 0.523583386992917 and parameters: {'lambda_l1': 6.054859218175769, 'lambda_l2': 0.004316957685744817, 'num_leaves': 127, 'feature_fraction': 0.40972157720093133, 'bagging_fraction': 0.7644731999899101, 'bagging_freq': 7, 'min_child_samples': 55}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:48:09,155]\u001b[0m Trial 2 finished with value: 0.5212788490866574 and parameters: {'lambda_l1': 0.07527833495421682, 'lambda_l2': 0.029781504664867495, 'num_leaves': 104, 'feature_fraction': 0.8999995238734342, 'bagging_fraction': 0.5441595083163666, 'bagging_freq': 6, 'min_child_samples': 17}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:48:53,639]\u001b[0m Trial 3 finished with value: 0.5227530755413969 and parameters: {'lambda_l1': 5.310264974104281e-08, 'lambda_l2': 0.0027588607592214797, 'num_leaves': 130, 'feature_fraction': 0.7425727483907769, 'bagging_fraction': 0.795486452019172, 'bagging_freq': 4, 'min_child_samples': 67}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:49:26,512]\u001b[0m Trial 4 finished with value: 0.5205629172738672 and parameters: {'lambda_l1': 0.05157475115361014, 'lambda_l2': 5.183166765315354e-07, 'num_leaves': 62, 'feature_fraction': 0.8282129348702065, 'bagging_fraction': 0.4122208327184637, 'bagging_freq': 1, 'min_child_samples': 16}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:50:25,580]\u001b[0m Trial 5 finished with value: 0.5230369064967635 and parameters: {'lambda_l1': 0.0025232942535502755, 'lambda_l2': 3.7659650094901864, 'num_leaves': 104, 'feature_fraction': 0.9289328985366875, 'bagging_fraction': 0.6908064521818567, 'bagging_freq': 1, 'min_child_samples': 5}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:51:02,183]\u001b[0m Trial 6 finished with value: 0.5222065950452435 and parameters: {'lambda_l1': 1.057582944354738e-08, 'lambda_l2': 3.177014566389053e-05, 'num_leaves': 177, 'feature_fraction': 0.43011117736198595, 'bagging_fraction': 0.6101857970306936, 'bagging_freq': 1, 'min_child_samples': 99}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:51:36,357]\u001b[0m Trial 7 finished with value: 0.5227742569559766 and parameters: {'lambda_l1': 2.0807894184525026, 'lambda_l2': 0.03246044408800279, 'num_leaves': 18, 'feature_fraction': 0.5419805571939607, 'bagging_fraction': 0.5277972086878301, 'bagging_freq': 6, 'min_child_samples': 48}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:52:20,342]\u001b[0m Trial 8 finished with value: 0.522854746331379 and parameters: {'lambda_l1': 3.9839742664482847, 'lambda_l2': 1.1625160364767405e-07, 'num_leaves': 121, 'feature_fraction': 0.717468919020778, 'bagging_fraction': 0.6666458460830307, 'bagging_freq': 6, 'min_child_samples': 81}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n",
      "\u001b[32m[I 2021-02-04 01:52:57,581]\u001b[0m Trial 9 finished with value: 0.5232487206425593 and parameters: {'lambda_l1': 5.814673488474365, 'lambda_l2': 1.4015026534768256e-06, 'num_leaves': 39, 'feature_fraction': 0.6770964622421864, 'bagging_fraction': 0.9810233732384347, 'bagging_freq': 7, 'min_child_samples': 13}. Best is trial 1 with value: 0.523583386992917.\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of finished trials: 10\n",
      "Best trial:\n",
      "  Value: 0.523583386992917\n",
      "  Params: \n",
      "    lambda_l1: 6.054859218175769\n",
      "    lambda_l2: 0.004316957685744817\n",
      "    num_leaves: 127\n",
      "    feature_fraction: 0.40972157720093133\n",
      "    bagging_fraction: 0.7644731999899101\n",
      "    bagging_freq: 7\n",
      "    min_child_samples: 55\n"
     ]
    }
   ],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    study = optuna.create_study(direction=\"maximize\")\n",
    "    study.optimize(objective, n_trials=10)\n",
    "\n",
    "    print(\"Number of finished trials: {}\".format(len(study.trials)))\n",
    "\n",
    "    print(\"Best trial:\")\n",
    "    trial = study.best_trial\n",
    "\n",
    "    print(\"  Value: {}\".format(trial.value))\n",
    "\n",
    "    print(\"  Params: \")\n",
    "    for key, value in trial.params.items():\n",
    "        print(\"    {}: {}\".format(key, value))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-04T01:52:57.631553Z",
     "iopub.status.busy": "2021-02-04T01:52:57.630849Z",
     "iopub.status.idle": "2021-02-04T01:59:23.485857Z",
     "shell.execute_reply": "2021-02-04T01:59:23.485255Z"
    },
    "papermill": {
     "duration": 385.870493,
     "end_time": "2021-02-04T01:59:23.486021",
     "exception": false,
     "start_time": "2021-02-04T01:52:57.615528",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "划分数据集...\n",
      "开始训练gbdt..\n",
      "[LightGBM] [Warning] lambda_l1 is set=0.0037465811711039068, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.0037465811711039068\n",
      "[LightGBM] [Warning] bagging_fraction is set=0.5916401743618064, subsample=1.0 will be ignored. Current value: bagging_fraction=0.5916401743618064\n",
      "[LightGBM] [Warning] feature_fraction is set=0.5970610375228046, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.5970610375228046\n",
      "[LightGBM] [Warning] bagging_freq is set=3, subsample_freq=0 will be ignored. Current value: bagging_freq=3\n",
      "[LightGBM] [Warning] lambda_l2 is set=2.527392199457406, reg_lambda=0.0 will be ignored. Current value: lambda_l2=2.527392199457406\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttrain's auc: 0.550543\ttrain's binary_logloss: 0.690948\tval's auc: 0.526328\tval's binary_logloss: 0.692347\n",
      "[100]\ttrain's auc: 0.556969\ttrain's binary_logloss: 0.689332\tval's auc: 0.529251\tval's binary_logloss: 0.691841\n",
      "[150]\ttrain's auc: 0.563228\ttrain's binary_logloss: 0.687963\tval's auc: 0.531146\tval's binary_logloss: 0.691516\n",
      "[200]\ttrain's auc: 0.56987\ttrain's binary_logloss: 0.686698\tval's auc: 0.533697\tval's binary_logloss: 0.69118\n",
      "[250]\ttrain's auc: 0.575722\ttrain's binary_logloss: 0.685668\tval's auc: 0.534513\tval's binary_logloss: 0.691008\n",
      "[300]\ttrain's auc: 0.580532\ttrain's binary_logloss: 0.684594\tval's auc: 0.534969\tval's binary_logloss: 0.690876\n",
      "[350]\ttrain's auc: 0.585198\ttrain's binary_logloss: 0.683667\tval's auc: 0.535321\tval's binary_logloss: 0.690788\n",
      "[400]\ttrain's auc: 0.590052\ttrain's binary_logloss: 0.682698\tval's auc: 0.536002\tval's binary_logloss: 0.690694\n",
      "[450]\ttrain's auc: 0.594265\ttrain's binary_logloss: 0.681789\tval's auc: 0.536335\tval's binary_logloss: 0.690621\n",
      "[500]\ttrain's auc: 0.59761\ttrain's binary_logloss: 0.680989\tval's auc: 0.536772\tval's binary_logloss: 0.690564\n",
      "[550]\ttrain's auc: 0.601344\ttrain's binary_logloss: 0.680147\tval's auc: 0.537519\tval's binary_logloss: 0.690489\n",
      "[600]\ttrain's auc: 0.605172\ttrain's binary_logloss: 0.679355\tval's auc: 0.537875\tval's binary_logloss: 0.69046\n",
      "[650]\ttrain's auc: 0.60867\ttrain's binary_logloss: 0.678612\tval's auc: 0.538034\tval's binary_logloss: 0.69044\n",
      "[700]\ttrain's auc: 0.611494\ttrain's binary_logloss: 0.677911\tval's auc: 0.53821\tval's binary_logloss: 0.690412\n",
      "[750]\ttrain's auc: 0.614723\ttrain's binary_logloss: 0.677136\tval's auc: 0.538201\tval's binary_logloss: 0.6904\n",
      "[800]\ttrain's auc: 0.618082\ttrain's binary_logloss: 0.676445\tval's auc: 0.538461\tval's binary_logloss: 0.690385\n",
      "[850]\ttrain's auc: 0.62076\ttrain's binary_logloss: 0.675809\tval's auc: 0.5383\tval's binary_logloss: 0.690403\n",
      "Early stopping, best iteration is:\n",
      "[794]\ttrain's auc: 0.617649\ttrain's binary_logloss: 0.676532\tval's auc: 0.538544\tval's binary_logloss: 0.690378\n",
      "CPU times: user 23min 26s, sys: 2.63 s, total: 23min 28s\n",
      "Wall time: 6min 25s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# 划分数据集\n",
    "print('划分数据集...')\n",
    "x_train = df_train[features]\n",
    "x_val = df_test[features]\n",
    "y_train = df_train['action']\n",
    "y_val = df_test['action']\n",
    "print('开始训练gbdt..')\n",
    "gbm = lgb.LGBMRegressor(objective='binary',\n",
    "                        lambda_l1=0.0037465811711039068,\n",
    "                        lambda_l2=2.527392199457406,\n",
    "                        num_leaves=61,\n",
    "                        feature_fraction=0.5970610375228046,\n",
    "                        bagging_fraction=0.5916401743618064,\n",
    "                        bagging_freq=3,\n",
    "                        min_child_samples=29,\n",
    "                        learning_rate=0.01,\n",
    "                        n_estimators=1000,\n",
    "                        random_state=42,\n",
    "                         #device='gpu'\n",
    "                        )\n",
    "\n",
    "model=gbm.fit(x_train, y_train,\n",
    "            eval_set = [(x_train, y_train), (x_val, y_val)],\n",
    "            eval_names = ['train', 'val'],\n",
    "            eval_metric = 'auc',\n",
    "            early_stopping_rounds = 100,\n",
    "            verbose=50,\n",
    "            )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-02-04T01:59:23.533448Z",
     "iopub.status.busy": "2021-02-04T01:59:23.532571Z",
     "iopub.status.idle": "2021-02-04T02:03:02.681766Z",
     "shell.execute_reply": "2021-02-04T02:03:02.682535Z"
    },
    "papermill": {
     "duration": 219.176854,
     "end_time": "2021-02-04T02:03:02.682793",
     "exception": false,
     "start_time": "2021-02-04T01:59:23.505939",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12min 35s, sys: 961 ms, total: 12min 36s\n",
      "Wall time: 3min 39s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import janestreet\n",
    "env = janestreet.make_env() # initialize the environment\n",
    "iter_test = env.iter_test() # an iterator which loops over the test set\n",
    "for (test_df, sample_prediction_df) in iter_test:\n",
    "    wt = test_df.iloc[0].weight\n",
    "    if(wt == 0):\n",
    "        sample_prediction_df.action = 0 \n",
    "    else:\n",
    "        X_test = test_df.loc[:, features].values\n",
    "        action = model.predict(X_test)\n",
    "        if (action > 0.5):\n",
    "            sample_prediction_df.action = 1\n",
    "        else:\n",
    "            sample_prediction_df.action = 0  \n",
    "    env.predict(sample_prediction_df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 1111.359605,
   "end_time": "2021-02-04T02:03:03.717872",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2021-02-04T01:44:32.358267",
   "version": "2.2.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
